blah blah

This file belongs to the repository: https://github.com/drisso/awst_analysis.

The code is released with license GPL v3.0.

Benchmarking single cell RNA-sequencing analysis pipelines using mixture control experiments (Nature)

Supplementary Figure 3 Visualization of representative benchmarking datasets using t-SNE and UMAP and violin plot of the number of doublets.

  1. t-SNE and UMAP visualizations of 4 datasets. From left to right: 10X single cell using 3 cell lines; 10X single cell using 5 cell lines; a CEL-seq2 cell mixture and a CEL-seq2 RNA mixture. Each point represents a cell or ‘pseudo cell’ and the number of cells in each plot is indicated in the title. (b) Violin plot of doublets in each dataset, identified using Demuxlet (DBL: doublet, SNG: single cell). The number of single cells and doublets are shown on top of each violin. Doublets were excluded when calculating the performance metrics.

Supplementary Figure 5: Example clustering results and summary of clustering performance using ARI and the number of clusters.

  1. Examples of clustering results visualized by PCA (top) and t-SNE (bottom), with different colours representing the cluster assignments made by the selected method. The sample sizes are 340 for RNAmix_CEL-seq2, 285 for cellmix3 and 274 for sc_CEL-seq2. Coefficients obtained from linear models fitted using the ARI (b) or the number of clusters (c) as dependent variables, and experimental design, normalization methods, imputation methods and clustering methods as covariates. The coefficients measure whether particular features have positive or negative associations with the dependent variables.

Install and load awst

if (!requireNamespace("BiocManager", quietly = TRUE))
    install.packages("BiocManager")

BiocManager::install("drisso/awst")
#BiocManager::install("GIS-SP-Group/RCA")
#BiocManager::install("hgu133plus2.db")

Setup

rm(list = ls())
#library(steFunctions)
library(dendextend)
library(clues)
#setwd("~/Dropbox/AWST/mixology/")
jobName <- "mixoloy20200818"
source(url("https://raw.githubusercontent.com/drisso/awst_analysis/master/functions.R"))
#####
save_plots <- FALSE
png_width_large <- 2100
png_height_large <- 500
png_width_small <- width_png <- 700
png_height_small <- 700
png_res <- 1/300
####
true_number_of_clusters <- c(3, 5, 7, 10, 10)
names(true_number_of_clusters) <- c("sc_10x", "sc_10x_5cl", "RNAmix_celseq2", "cellmix4", "cellmix3")
####
results <- matrix(NA, ncol = 8, nrow = 10)
colnames(results) <- c("where", "what", "ARI", "accuracy", "purity", "avg", "noOfClust_Est", "noOfClust_Th") 
#results <- data.frame(where = rep("", 30), what = "", ARI = NA, accuracy = NA, purity = NA, noOfClust = NA)
k <- 0

sc_10x_5cl

con <- gzcon(url("https://github.com/LuyiTian/sc_mixology/blob/master/data/csv/sc_10x_5cl.metadata.csv.gz?raw=true"))
annotation.df <- read.csv(textConnection(readLines(con)))
annotation.df$cell.col <- factor(annotation.df$cell_line)
levels(annotation.df$cell.col) <- c("gold", "red", "blue", "magenta", "green3")
annotation.df$cell.col <- as.character(annotation.df$cell.col)
annotation.df$cell_line <- paste0("mix", annotation.df$mix)

con <- gzcon(url("https://github.com/LuyiTian/sc_mixology/blob/master/data/csv/sc_10x_5cl.count.csv.gz?raw=true"))
ddata <- read.csv(textConnection(readLines(con)))

both <- intersect(colnames(ddata), rownames(annotation.df))
ddata <- ddata[, both]
annotation.df <- annotation.df[both,]
prefix <- "sc_10x_5cl"
save(ddata, annotation.df, prefix, file = paste0(prefix, "_counts.RData"))
require(awst)
require(EDASeq)
require(Rtsne)
require(umap)
require(SingleCellExperiment)
require(clusterExperiment)
############################
#load(paste0(prefix, "_counts.RData"))
#load(paste0(prefix, "_expression.RData"))
#######################
no_of_detected_gene_per_sample <- colSums(ddata > 0) 
fivenum(no_of_detected_gene_per_sample)
ddata <- EDASeq::betweenLaneNormalization(as.matrix(ddata), which = "full", round = FALSE)
# apply the AWS-transformation 
tmp <- rowSums(ddata)
sum(tmp > 0)
tmp <- colSums(ddata)
sum(tmp > 0)

exprData <- awst(ddata, poscount = TRUE, full_quantile = TRUE)
sum(is.na(rowSums(exprData)))
sum(is.na(colSums(exprData)))

save(exprData, prefix, file = paste0(prefix, "_expression.RData"))
#dim(exprData <- gene_filter(exprData))
#write.table(exprData, file =  paste0(prefix, "_expression.tsv"), sep = "\t")

nrow_exprData <- nrow(exprData)
ncol_exprData <- ncol(exprData)

library(irlba)
pprcomp <- prcomp_irlba(exprData, 5) # Run PC analysis

set.seed(2020) 
ans_Rtsne <- Rtsne(exprData, pca = FALSE) # Run TSNE

set.seed(2020) 
ans_umap <- umap(exprData) # Run Umap
#ans_RaceID <- ans_RaceID_raw <- NULL
#if(prefix %in% c("RNAmix_celseq2", "cellmix3", "cellmix4")) {
require(SingleCellExperiment)

if(FALSE) {
  tmp <- get_RCA()
  wwhich <- which(colnames(colData(tmp)) %in% colnames(annotation.df))
  ans_RCA_raw  <- colData(tmp)[, -wwhich]

  tmp <- get_RCA(is_awst = TRUE)
  wwhich <- which(colnames(colData(tmp)) %in% colnames(annotation.df))
  ans_RCA_awst <- colData(tmp)[, -wwhich]

  tmp <- get_RaceID()
  wwhich <- which(colnames(colData(tmp)) %in% colnames(annotation.df))
  ans_RaceID_raw <- colData(tmp)[, -wwhich]
  
  tmp <- get_RaceID(is_awst = TRUE)
  wwhich <- which(colnames(colData(tmp)) %in% colnames(annotation.df))
  ans_RaceID_awst <- colData(tmp)[, -wwhich]
  
  tmp <- get_SC3()
  wwhich <- which(colnames(colData(tmp)) %in% colnames(annotation.df))
  ans_SC3_raw <- colData(tmp)[, -wwhich]
  
  tmp <- get_SC3(is_awst = TRUE)
  wwhich <- which(colnames(colData(tmp)) %in% colnames(annotation.df))
  ans_SC3_awst  <- colData(tmp)[, -wwhich]
  
  tmp <- get_Seurat()
  wwhich <- which(colnames(colData(tmp)) %in% colnames(annotation.df))
  ans_Seurat_raw_LoR  <- colData(tmp)[, -wwhich]

  tmp <- get_Seurat(is_awst = TRUE)
  wwhich <- which(colnames(colData(tmp)) %in% colnames(annotation.df))
  ans_Seurat_awst_LoR  <- colData(tmp)[, -wwhich]

  tmp <- get_Seurat(resolution = 1.6)
  wwhich <- which(colnames(colData(tmp)) %in% colnames(annotation.df))
  ans_Seurat_raw_HiR  <- colData(tmp)[, -wwhich]

  tmp <- get_Seurat(is_awst = TRUE, resolution = 1.6)
  wwhich <- which(colnames(colData(tmp)) %in% colnames(annotation.df))
  ans_Seurat_awst_HiR  <- colData(tmp)[, -wwhich]
}

tmp <- get_clusterExperiment()
wwhich <- which(colnames(colData(tmp)) %in% colnames(annotation.df))
ans_clusterExp_raw <- colData(tmp)[, -wwhich, drop=FALSE]

tmp <- get_clusterExperiment(is_awst = TRUE)
wwhich <- which(colnames(colData(tmp)) %in% colnames(annotation.df))
ans_clusterExp_awst <- colData(tmp)[, -wwhich, drop=FALSE]

save(nrow_exprData, ncol_exprData, annotation.df,
     pprcomp, ans_Rtsne, ans_umap, prefix, 
     ans_clusterExp_awst, ans_clusterExp_raw,
     file = paste0(prefix, "_expression_working.RData"))

PCA of sc_10x_5cl

t-SNE of sc_10x_5cl

umap of sc_10x_5cl

counts + clusterExperiment of sc_10x_5cl

## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which, which.max, which.min
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:base':
## 
##     expand.grid
## estimated no. of clusters: 40
## adjusted Rand-index: 0.8362
## cluster purity: 0.2468
## cluster accuracy: 0.0503

awst + clusterExperiment of sc_10x_5cl

## estimated no. of clusters: 28
## adjusted Rand-index: 0.8377
## cluster purity: 0.221
## cluster accuracy: 0.0238

sc_10x

PCA of sc_10x

t-SNE of sc_10x

umap of sc_10x

counts + clusterExperiment of sc_10x

## estimated no. of clusters: 25
## adjusted Rand-index: 0.725
## cluster purity: 0.3336
## cluster accuracy: 0.0136

awst + clusterExperiment of sc_10x

## estimated no. of clusters: 22
## adjusted Rand-index: 0.7413
## cluster purity: 0.2962
## cluster accuracy: 0.0091

RNAmix_celseq2

PCA of RNAmix_celseq2

t-SNE of RNAmix_celseq2

umap of RNAmix_celseq2

counts + clusterExperiment of RNAmix_celseq2

## estimated no. of clusters: 19
## adjusted Rand-index: 0.8868
## cluster purity: 0.3371
## cluster accuracy: 0.1493

awst + clusterExperiment of RNAmix_celseq2

## estimated no. of clusters: 12
## adjusted Rand-index: 0.9007
## cluster purity: 0.1937
## cluster accuracy: 0.1863

cellmix3

## 
## 009 018 027 036 045 054 063 072 081 090 108 111 117 171 180 207 225 252 270 306 
##  18   4   4   5   4   4   4   4   3  10   5   2  18  18   4   5  20  17   4   5 
## 333 360 405 450 504 522 540 603 630 702 711 720 801 810 900 
##  30   5   3   4   5  17   4   5   2   4  16   4   4   5  19
## 
##   balanced      H1975  H1975High   H1975Low      H2228  H2228High   H2228Low 
##         32         19         33         33         10         34         33 
##     HCC827 HCC827High  HCC827Low 
##         18         37         36
## Loading required package: awst
## Loading required package: EDASeq
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: ShortRead
## Loading required package: BiocParallel
## Loading required package: Biostrings
## Loading required package: IRanges
## Loading required package: XVector
## 
## Attaching package: 'Biostrings'
## The following object is masked from 'package:dendextend':
## 
##     nnodes
## The following object is masked from 'package:base':
## 
##     strsplit
## Loading required package: Rsamtools
## Loading required package: GenomeInfoDb
## Loading required package: GenomicRanges
## Loading required package: GenomicAlignments
## Loading required package: SummarizedExperiment
## Loading required package: DelayedArray
## Loading required package: matrixStats
## 
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
## 
##     anyMissing, rowMedians
## 
## Attaching package: 'DelayedArray'
## The following objects are masked from 'package:matrixStats':
## 
##     colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges
## The following objects are masked from 'package:base':
## 
##     aperm, apply, rowsum
## Loading required package: Rtsne
## Loading required package: umap
## Loading required package: SingleCellExperiment
## Loading required package: clusterExperiment
## 
## Attaching package: 'clusterExperiment'
## The following object is masked from 'package:clues':
## 
##     plotClusters
##   J2   G6  P19  M12  E17 
## 1604 2660 3371 4205 8325
## [1] 14052
## [1] 285
## [1] 0
## [1] 0
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following object is masked from 'package:S4Vectors':
## 
##     expand
## Note: Merging will be done on ' makeConsensus ', with clustering index 1
## Warning: Zero sample variances detected, have been offset away from zero
## Note: Merging will be done on ' makeConsensus ', with clustering index 1

PCA of cellmix3

t-SNE of cellmix3

umap of cellmix3

counts + clusterExperiment of cellmix3

## estimated no. of clusters: 11
## adjusted Rand-index: 0.8274
## cluster purity: 0.3762
## cluster accuracy: 0.4554

awst + clusterExperiment of cellmix3

## estimated no. of clusters: 2
## adjusted Rand-index: 0.4204
## cluster purity: 0.1182
## cluster accuracy: 0.4228

cellmix4

## 
##   balanced      H1975  H1975High   H1975Low      H2228  H2228High   H2228Low 
##         33         19         33         33         10         34         34 
##     HCC827 HCC827High  HCC827Low 
##         18         38         36
##      H9     H16     C10     A19      I6 
##  2382.0  4022.0  4914.5  5906.0 11816.0
## [1] 15146
## [1] 288
## [1] 0
## [1] 0
## Note: Merging will be done on ' makeConsensus ', with clustering index 1
## Warning: Zero sample variances detected, have been offset away from zero
## Note: Merging will be done on ' makeConsensus ', with clustering index 1

PCA of cellmix4

t-SNE of cellmix4

umap of cellmix4

counts + clusterExperiment of cellmix4

## estimated no. of clusters: 13
## adjusted Rand-index: 0.8412
## cluster purity: 0.4182
## cluster accuracy: 0.458

awst + clusterExperiment of cellmix4

## estimated no. of clusters: 2
## adjusted Rand-index: 0.3134
## cluster purity: 0.0854
## cluster accuracy: 0.4259

Table

where what ARI accuracy purity avg noOfClust_Est noOfClust_Th
sc_10x_5cl clusterExperiment + counts 0.8362 0.0503 0.2468 0.8426 40 5
sc_10x_5cl clusterExperiment + AWST 0.8377 0.0238 0.2210 0.8604 28 5
sc_10x clusterExperiment + counts 0.7250 0.0136 0.3336 0.7811 25 3
sc_10x clusterExperiment + AWST 0.7413 0.0091 0.2962 0.8026 22 3
RNAmix_celseq2 clusterExperiment + counts 0.8868 0.1493 0.3371 0.7937 19 7
RNAmix_celseq2 clusterExperiment + AWST 0.9007 0.1863 0.1937 0.8392 12 7
cellmix3 clusterExperiment + counts 0.8274 0.4554 0.3762 0.6551 11 10
cellmix3 clusterExperiment + AWST 0.4204 0.4228 0.1182 0.5981 2 10
cellmix4 clusterExperiment + counts 0.8412 0.4580 0.4182 0.6425 13 10
cellmix4 clusterExperiment + AWST 0.3134 0.4259 0.0854 0.548 2 10

(*) with at least a cluster of size 1.

Session info

sessionInfo()
## R version 4.0.0 (2020-04-24)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.4 LTS
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.2.20.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=C             
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] irlba_2.3.3                 Matrix_1.2-18              
##  [3] clusterExperiment_2.8.0     SingleCellExperiment_1.10.1
##  [5] umap_0.2.5.0                Rtsne_0.15                 
##  [7] EDASeq_2.22.0               ShortRead_1.46.0           
##  [9] GenomicAlignments_1.24.0    SummarizedExperiment_1.18.2
## [11] DelayedArray_0.14.1         matrixStats_0.56.0         
## [13] Rsamtools_2.4.0             GenomicRanges_1.40.0       
## [15] GenomeInfoDb_1.24.2         Biostrings_2.56.0          
## [17] XVector_0.28.0              IRanges_2.22.2             
## [19] BiocParallel_1.22.0         Biobase_2.48.0             
## [21] awst_0.0.4                  S4Vectors_0.26.1           
## [23] BiocGenerics_0.34.0         clues_0.6.2.2              
## [25] dendextend_1.13.4           knitr_1.28                 
## [27] BiocManager_1.30.10        
## 
## loaded via a namespace (and not attached):
##   [1] uuid_0.1-4             aroma.light_3.18.0     BiocFileCache_1.12.1  
##   [4] NMF_0.22.0             plyr_1.8.6             lazyeval_0.2.2        
##   [7] splines_4.0.0          rncl_0.8.4             ggplot2_3.3.1         
##  [10] gridBase_0.4-7         digest_0.6.25          foreach_1.5.0         
##  [13] htmltools_0.4.0        viridis_0.5.1          magrittr_1.5          
##  [16] memoise_1.1.0          cluster_2.1.0          doParallel_1.0.15     
##  [19] limma_3.44.3           annotate_1.66.0        R.utils_2.9.2         
##  [22] askpass_1.1            prettyunits_1.1.1      jpeg_0.1-8.1          
##  [25] colorspace_1.4-1       blob_1.2.1             rappdirs_0.3.1        
##  [28] xfun_0.14              dplyr_1.0.0            crayon_1.3.4          
##  [31] RCurl_1.98-1.2         jsonlite_1.6.1         genefilter_1.70.0     
##  [34] phylobase_0.8.10       survival_3.1-12        iterators_1.0.12      
##  [37] ape_5.4                glue_1.4.1             registry_0.5-1        
##  [40] gtable_0.3.0           zlibbioc_1.34.0        kernlab_0.9-29        
##  [43] Rhdf5lib_1.10.1        HDF5Array_1.16.1       scales_1.1.1          
##  [46] DESeq_1.39.0           DBI_1.1.0              edgeR_3.30.3          
##  [49] rngtools_1.5           bibtex_0.4.2.2         Rcpp_1.0.4.6          
##  [52] viridisLite_0.3.0      xtable_1.8-4           progress_1.2.2        
##  [55] reticulate_1.16        bit_1.1-15.2           httr_1.4.1            
##  [58] RColorBrewer_1.1-2     ellipsis_0.3.1         pkgconfig_2.0.3       
##  [61] XML_3.99-0.3           R.methodsS3_1.8.0      dbplyr_1.4.4          
##  [64] locfit_1.5-9.4         softImpute_1.4         howmany_0.3-1         
##  [67] tidyselect_1.1.0       rlang_0.4.6            reshape2_1.4.4        
##  [70] AnnotationDbi_1.50.3   munsell_0.5.0          tools_4.0.0           
##  [73] generics_0.0.2         RSQLite_2.2.0          ade4_1.7-15           
##  [76] evaluate_0.14          stringr_1.4.0          yaml_2.2.1            
##  [79] bit64_0.9-7            purrr_0.3.4            nlme_3.1-148          
##  [82] R.oo_1.23.0            xml2_1.3.2             biomaRt_2.44.1        
##  [85] compiler_4.0.0         curl_4.3               png_0.1-7             
##  [88] tibble_3.0.1           geneplotter_1.66.0     RNeXML_2.4.4          
##  [91] stringi_1.4.6          highr_0.8              GenomicFeatures_1.40.1
##  [94] RSpectra_0.16-0        lattice_0.20-41        vctrs_0.3.1           
##  [97] pillar_1.4.4           lifecycle_0.2.0        zinbwave_1.10.0       
## [100] bitops_1.0-6           rtracklayer_1.48.0     R6_2.4.1              
## [103] latticeExtra_0.6-29    hwriter_1.3.2          gridExtra_2.3         
## [106] codetools_0.2-16       MASS_7.3-51.6          assertthat_0.2.1      
## [109] rhdf5_2.32.2           openssl_1.4.1          pkgmaker_0.31.1       
## [112] withr_2.2.0            GenomeInfoDbData_1.2.3 locfdr_1.1-8          
## [115] hms_0.5.3              grid_4.0.0             tidyr_1.1.0           
## [118] rmarkdown_2.2
knitr::knit_exit()